In [1]:
import sys
sys.path.append('/Users/erickpeirson/tethne')

In [2]:
import matplotlib.pyplot as plt

1. Create a corpus from a JSTOR DfR dataset

1.1. Load bibliographic data


In [3]:
from tethne.readers import dfr

In [4]:
# Directories of the JSTOR DfR datasets that are read below.
# NOTE(review): the second and third entries are the same directory
# (2013.5.3.k2HUvXh9), so that dataset is listed twice -- confirm whether a
# different third dataset was intended.
datapath = ['/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.cHrmED8A',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9']

In [5]:
outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldaout'
temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldatemp'

In [6]:
# Call dfr.read on every dataset directory and flatten the results into one list.
papers = [ p for path in datapath for p in dfr.read(path) ]

In [7]:
len(papers)


Out[7]:
880

1.2. Load wordcounts


In [8]:
# Merge the unigram ('uni') data of every dataset directory into one dict.
# dict.update means a key seen in a later directory replaces the earlier entry.
# NOTE(review): presumably keyed by paper identifier (the corpus is indexed by
# 'doi' further down) -- confirm against dfr.ngrams.
wordcounts = {}
for path in datapath:
    w = dfr.ngrams(path, 'uni')
    wordcounts.update(w)

1.3. Load NLTK stoplist


In [9]:
from nltk.corpus import stopwords

In [10]:
# NOTE(review): no language is passed to words(); NLTK then appears to return the
# stoplists of all installed languages combined -- confirm that this is intended
# rather than stopwords.words('english').
stoplist = stopwords.words()

1.4. Create a Corpus


In [11]:
from tethne import Corpus

In [12]:
# Assemble the corpus: papers indexed by 'doi', the unigram counts registered as
# the 'wordcounts' featureset, and the NLTK stoplist handed over via exclude.
D = Corpus(
    papers,
    features={'wordcounts': wordcounts},
    index_by='doi',
    exclude=stoplist,
)

1.5. Filter words in wordcount featureset


In [13]:
def filt(s, C, DC):
    """Decide whether a feature passes the wordcount filter.

    Returns True only when ``C > 3``, ``DC > 1`` and ``len(s) > 3`` all hold;
    otherwise returns False. The checks run in that order and stop at the
    first failure, so ``len(s)`` is only evaluated once both counts pass.

    NOTE(review): presumably ``s`` is the token, ``C`` its overall count and
    ``DC`` the number of documents containing it -- confirm against
    ``Corpus.filter_features``.
    """
    if not C > 3:
        return False
    if not DC > 1:
        return False
    return True if len(s) > 3 else False

In [14]:
D.filter_features('wordcounts', 'wordcounts_filtered', filt)

In [15]:
len(D.features['wordcounts']['index']), len(D.features['wordcounts_filtered']['index'])


Out[15]:
(122836, 27750)

1.6. Create a time-period index


In [16]:
D.slice('date', method='time_period', window_size=5)

In [17]:
D.plot_distribution('date')


[1921, 1926, 1931, 1936, 1941, 1946, 1951, 1956, 1961, 1966, 1971, 1976]

In [18]:
D.slice('jtitle')

In [21]:
D.plot_distribution('date', 'jtitle', aspect=0.1, interpolation='none')


[1921, 1926, 1931, 1936, 1941, 1946, 1951, 1956, 1961, 1966, 1971, 1976]
[0, 2, 4, 6, 8, 10]

In [23]:
# NOTE(review): the traceback stored below shows this call without
# include_papers=False, so the output predates the current cell text --
# re-run to check whether the NameError still occurs.
D.get_by([('date',1946), ('date',1951)], include_papers=False)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-23-22d13fb7070d> in <module>()
----> 1 D.get_by([('date',1946), ('date',1951)])

/Users/erickpeirson/tethne/tethne/classes/corpus.py in get_by(self, key_indices, include_papers)
    820             A list of paper indices, or :class:`.Paper` instances.
    821 
--> 822         """
    823 
    824         if len(self.axes) == 0:

NameError: global name 'papers' is not defined

In [19]:
from tethne.model.managers import DTMModelManager

In [20]:
dtm_path = '/Users/erickpeirson/tethne/tethne/model/bin/main'

In [21]:
dtm_outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmout'
dtm_temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmtemp'

In [22]:
# Set up the DTM manager on the filtered unigram featureset, using the output
# directory, temp directory and DTM binary path assigned in the cells above.
DM = DTMModelManager(
    D,
    feature='wordcounts_filtered',
    outpath=dtm_outpath,
    temppath=dtm_temppath,
    dtm_path=dtm_path,
)

In [23]:
DM.prep()

In [24]:
DM.build()


Out[24]:
<tethne.model.corpus.dtmmodel.DTMModel at 0x10db6d1d0>

In [25]:
import cPickle as pickle

In [26]:
# Persist the manager next to the other DTM output. The location is derived
# from dtm_outpath (same resulting path as before) so that changing the output
# directory also moves the pickle instead of leaving a stale absolute literal.
# NOTE(review): only unpickle this file from a trusted location -- loading a
# pickle can execute arbitrary code.
dm_pickle_path = dtm_outpath + '/DTMModelManager.pickle'
with open(dm_pickle_path, 'wb') as f:
    pickle.dump(DM, f)

In [28]:
DM.list_topic_diachronic(1)


Out[28]:
{0: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'pollination'],
 1: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'pollination'],
 2: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'subsp'],
 3: ['pollen',
  'species',
  'flowers',
  'embryo',
  'grains',
  'plants',
  'number',
  'perenne',
  'flower',
  'subsp'],
 4: ['pollen',
  'species',
  'flowers',
  'grains',
  'plants',
  'number',
  'embryo',
  'perenne',
  'subsp',
  'flower'],
 5: ['pollen',
  'species',
  'flowers',
  'plants',
  'grains',
  'number',
  'perenne',
  'subsp',
  'embryo',
  'flower'],
 6: ['pollen',
  'species',
  'flowers',
  'perenne',
  'plants',
  'number',
  'subsp',
  'grains',
  'female',
  'flower'],
 7: ['pollen',
  'species',
  'perenne',
  'flowers',
  'subsp',
  'plants',
  'number',
  'female',
  'chromosome',
  'grains'],
 8: ['pollen',
  'species',
  'flowers',
  'plants',
  'female',
  'number',
  'chromosome',
  'male',
  'subsp',
  'perenne'],
 9: ['species',
  'pollen',
  'female',
  'plants',
  'number',
  'flowers',
  'male',
  'chromosome',
  'pollination',
  'table'],
 10: ['species',
  'pollen',
  'chromosome',
  'number',
  'plants',
  'female',
  'flowers',
  'male',
  'table',
  'pollination'],
 11: ['species',
  'pollen',
  'chromosome',
  'number',
  'plants',
  'female',
  'table',
  'flowers',
  'numbers',
  'male']}

In [30]:
import networkx as nx

In [43]:
g = nx.Graph(name='my graph')

In [47]:
# Out[50] and Out[214] below also contain an edge 0-3 with weight 0.5 that no
# remaining cell creates; add it here so a fresh kernel reproduces those outputs.
g.add_edge(0, 3, weight=0.5)
g.add_edge(1, 3, weight=0.5)

In [49]:
g.add_node(1, size=0.3)

In [50]:
g.__dict__


Out[50]:
{'adj': {0: {3: {'weight': 0.5}},
  1: {3: {'weight': 0.5}},
  3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}},
 'edge': {0: {3: {'weight': 0.5}},
  1: {3: {'weight': 0.5}},
  3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}},
 'graph': {'name': 'my graph'},
 'node': {0: {}, 1: {'size': 0.3}, 3: {}}}

In [51]:
from scipy.sparse import coo_matrix

In [191]:
I = [0,1,2,3,3]
J = [1,1,3,0,1]
K = [1, 2, 3, 4, 5]

In [192]:
A = coo_matrix((K, (I, J)))

In [211]:
# (row, column) index pairs of the stored entries. nonzero() is evaluated once,
# and list() forces the pairs so they also display under Python 3, where zip()
# returns a lazy iterator.
list(zip(*A.nonzero()))


Out[211]:
[(0, 1), (1, 1), (2, 3), (3, 0), (3, 1)]

In [194]:
B = A.tocsr()

In [195]:
B[0,1]


Out[195]:
1

In [196]:
C = A.tolil()

In [197]:
A.nonzero()[0]


Out[197]:
array([0, 1, 2, 3, 3], dtype=int32)

In [198]:
C.nonzero()


Out[198]:
(array([0, 1, 2, 3, 3], dtype=int32), array([1, 1, 3, 0, 1], dtype=int32))

In [203]:
# Column indices of the nonzeros in row 0, united with the row indices of the
# nonzeros in column 0 (duplicates removed by the set union).
list(set(B[0,:].nonzero()[1]) | set(B[:,0].nonzero()[0]))


Out[203]:
[1, 3]

In [200]:
A.nonzero()


Out[200]:
(array([0, 1, 2, 3, 3], dtype=int32), array([1, 1, 3, 0, 1], dtype=int32))

In [201]:
class SA(object):
    """Minimal probe showing what a subscript such as ``obj[i, j]`` passes to __getitem__."""

    def __getitem__(self, indices):
        """Unpack a two-part subscript and print both parts separated by a space.

        indices -- the subscript object; ``obj[a, b]`` delivers the tuple ``(a, b)``.

        Returns None. Raises whatever unpacking raises when ``indices`` is not
        an iterable of exactly two items.
        """
        i, j = indices
        # One pre-formatted argument: valid both as a Python 2 print statement
        # and as a Python 3 print() call, producing the same "i j" text.
        print('%s %s' % (i, j))

In [202]:
SA()[0,1]


0 1

In [214]:
g.edges(data=True)


Out[214]:
[(0, 3, {'weight': 0.5}), (1, 3, {'weight': 0.5})]

In [216]:
A.data


Out[216]:
array([1, 2, 3, 4, 5])

In [218]:
g.edge


Out[218]:
{0: {3: {'weight': 0.5}},
 1: {3: {'weight': 0.5}},
 3: {0: {'weight': 0.5}, 1: {'weight': 0.5}}}

In [219]:
from tethne.persistence.hdf5.graphcollection import HDF5Graph


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-219-b621458adbac> in <module>()
----> 1 from tethne.persistence.hdf5.graphcollection import HDF5Graph

ImportError: No module named graphcollection

In [ ]: